#
#remove scientific notation
options(scipen=999)
library(stringr)
library(corrplot)
## corrplot 0.84 loaded
library(shiny)
load("Data/county_factors.rda")
load("Data/county_500CitiesData.rda")
data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/"
# Read in the data
US.deaths <- read.csv(
paste0(data.path, "time_series_covid19_deaths_US.csv"),
header = T, stringsAsFactors = F)
US.cases <- read.csv(
paste0(data.path, "time_series_covid19_confirmed_US.csv"),
header = T, stringsAsFactors = F)
# Read in the header seprately.
US.cases.head <- read.csv(
paste0(data.path, "time_series_covid19_confirmed_US.csv"),
header = F, stringsAsFactors = F)[1,]
US.deaths.head <- read.csv(
paste0(data.path, "time_series_covid19_deaths_US.csv"),
header = F, stringsAsFactors = F)[1,]
# Correct the dates in the header to be more useable as
# column names.
proper_date <- function(dates){
dates <- sapply(dates, strsplit, split = "/")
dates <- lapply(dates, str_pad, width = 2, side = "left", pad = "0")
dates <- lapply(dates, paste, collapse = "_")
dates <- unlist(dates)
return(dates)
}
dates.cases <- proper_date(US.cases.head[-c(1:11)])
dates.deaths <- proper_date(US.deaths.head[-c(1:12)])
names(US.cases) <- c(US.cases.head[1,1:11], dates.cases)
names(US.deaths) <- c(US.deaths.head[1,1:12], dates.deaths)
if(sum(US.cases$UID != US.deaths$UID) > 0){warning("COVID data rows do not match!")}
US.cases$Population <- US.deaths$Population
US.cases <- US.cases[,c(1:11, ncol(US.cases), 12:(ncol(US.cases)-1))]
data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/"
daily_filenames <- list.files(data.path)
daily_filenames <- daily_filenames[daily_filenames != "README.md"]
todays_report_filename <- daily_filenames[length(daily_filenames)]
US.todaysReport <- read.csv(
paste0(data.path, todays_report_filename),
header = T, stringsAsFactors = F)
all.states <- c('Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Diamond Princess', 'District of Columbia', 'Florida', 'Georgia', 'Grand Princess', 'Guam', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming')
all.states.df <- data.frame(Province_State = all.states)
all.stats <- c("Confirmed", "Deaths", "Recovered", "Active", "Incident_Rate", "People_Tested", "People_Hospitalized", "Mortality_Rate", "Testing_Rate", "Hospitalization_Rate")
compiled.stats <- list()
for(i in 1:length(daily_filenames)){
day <- substring(daily_filenames[i],1,10)
data <- read.csv(
paste0(data.path, daily_filenames[i]),
header = T, stringsAsFactors = F)
compiled.stats[[i]] <- merge(all.states.df, data, all.y = F)
names(compiled.stats)[i] <- day
}
plot.dailyStat <- function(state, stat){
data <- sapply(1:length(daily_filenames), function(x){compiled.stats[[x]][compiled.stats[[x]]$Province_State == state, stat]})
names(data) <- daily_filenames
barplot(data, main = paste0(state, " ", stat), las = 2, cex.axis = 1, cex.names = 0.5)
}
plot.dailyStatRise <- function(state, stat){
data <- sapply(1:length(daily_filenames), function(x){compiled.stats[[x]][compiled.stats[[x]]$Province_State == state, stat]})
names(data) <- daily_filenames
rise.stat <- matrix(ncol = length(data) - 1, nrow = 1)
colnames(rise.stat) <- names(data)[-1]
for(i in 1:ncol(rise.stat) + 1){
rise <- data[i] - data[i-1]
rise.stat[i-1] <- rise
}
barplot(rise.stat, main = paste0(state, " rise in ",stat), las = 2, cex.axis = 1, cex.names = 0.5)
}
Province_State - The name of the State within the USA. Country_Region - The name of the Country (US). Last_Update - The most recent date the file was pushed. Lat - Latitude. Long_ - Longitude. Confirmed - Aggregated confirmed case count for the state. Deaths - Aggregated Death case count for the state. Recovered - Aggregated Recovered case count for the state. Active - Aggregated confirmed cases that have not been resolved (Active = Confirmed - Recovered - Deaths). FIPS - Federal Information Processing Standards code that uniquely identifies counties within the USA. Incident_Rate - confirmed cases per 100,000 persons. People_Tested - Total number of people who have been tested. People_Hospitalized - Total number of people hospitalized. Mortality_Rate - Number recorded deaths * 100/ Number confirmed cases. UID - Unique Identifier for each row entry. ISO3 - Officialy assigned country code identifiers. Testing_Rate - Total number of people tested per 100,000 persons. Hospitalization_Rate - Total number of people hospitalized * 100/ Number of confirmed cases.
US.cases.info <- as.matrix(US.cases[,1:12])
US.cases.data <- as.matrix(US.cases[,-c(2:12)])
US.deaths.info <- as.matrix(US.deaths[,1:12])
US.deaths.data <- as.matrix(US.deaths[,-c(2:12)])
rownames(US.cases.info) <- US.cases.info[,1]
US.cases.info <- US.cases.info[,-1]
rownames(US.cases.data) <- US.cases.data[,1]
US.cases.data <- US.cases.data[,-1]
rownames(US.deaths.info) <- US.deaths.info[,1]
US.deaths.info <- US.deaths.info[,-1]
rownames(US.deaths.data) <- US.deaths.data[,1]
US.deaths.data <- US.deaths.data[,-1]
ndays.cases <- ncol(US.cases.data)
ndays.deaths <- ncol(US.deaths.data)
nobs <- nrow(US.cases.data)
state.curve <- function(state, stat = c("cases", "deaths"), logScale = T){
if(stat == "cases"){
data <- US.cases.data[which(US.cases$Province_State == state),]
}else if(stat == "deaths"){
data <- US.deaths.data[which(US.deaths$Province_State == state),]
}
data.sum <- colSums(data)
day.first.case <- min(which(data.sum > 0))
n.days <- length(data.sum)
if(logScale == T){
barplot(data.sum[day.first.case:n.days],
main = paste0("Total COVID-19 ", stat," by date in ", state, ", log scale"),
log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
}else{
barplot(data.sum[day.first.case:n.days],
main = paste0("Total COVID-19 ", stat," by date in ", state),
las = 2, cex.axis = 1, cex.names = 0.5)
}
}
state.rise <- function(state, stat = c("cases", "deaths")){
if(stat == "cases"){
data.thisState <- US.cases.data[which(US.cases$Province_State == state),]
}else if(stat == "deaths"){
data.thisState <- US.deaths.data[which(US.deaths$Province_State == state),]
}
data.sum <- colSums(data.thisState)
n.days <- ncol(data.thisState)
rise.cases <- matrix(ncol = n.days - 1, nrow = 1)
colnames(rise.cases) <- colnames(data.thisState)[-1]
for(i in 1:ncol(rise.cases) + 1){
rise <- data.sum[i] - data.sum[i-1]
rise.cases[i-1] <- rise
}
day.first.case <- min(which(rise.cases > 0))
n.days <- length(rise.cases)
barplot(rise.cases[,day.first.case:n.days], main = paste0("Rise in COVID-19 ", stat, " by Date in ", state), las = 2, cex.axis = 1, cex.names = 0.5)
}
county.curve <- function(county, stat = c("cases", "deaths")){
if(stat == "cases"){
data <- US.cases.data[which(US.cases$Admin2 == county),]
}else if(stat == "deaths"){
data <- US.deaths.data[which(US.deaths$Admin2 == county),]
}
day.first.case <- min(which(data > 0))
n.days <- length(data)
barplot(data[day.first.case:n.days], main = paste0("Total COVID-19 ", stat," by date in ", county), log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
}
county.curve("Tulsa", "cases")
county.curve("Tulsa", "deaths")
US.stats <- data.frame(UID = US.cases$UID)
cases.total <- colSums(US.cases.data)
day.first.case <- min(which(cases.total > 100))
n.days <- length(cases.total)
par(mar = c(5,5,4,2))
barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")
deaths.total <- colSums(US.deaths.data)
day.first.case <- min(which(deaths.total > 0))
n.days <- length(deaths.total)
barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")
avg.rise.cases
rise.cases <- matrix(ncol = ndays.cases - 1, nrow = nobs)
colnames(rise.cases) <- colnames(US.cases.data)[-1]
for(i in 1:ncol(rise.cases) + 1){
rise <- US.cases.data[,i] - US.cases.data[,i-1]
rise.cases[,i-1] <- rise
}
US.stats$avg.rise.cases <- apply(rise.cases, 1, mean)
rise.cases.total <- colSums(rise.cases)
day.first.case <- min(which(rise.cases.total > 0))
n.days <- length(rise.cases.total)
barplot(rise.cases.total[day.first.case:n.days], main = "Rise in Cases of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
avg.rise.deaths
rise.deaths <- matrix(ncol = ndays.deaths - 1, nrow = nobs)
colnames(rise.deaths) <- colnames(US.deaths.data)[-1]
for(i in 1:ncol(rise.deaths) + 1){
rise <- US.deaths.data[,i] - US.deaths.data[,i-1]
rise.deaths[,i-1] <- rise
}
US.stats$avg.rise.deaths <- apply(rise.deaths, 1, mean)
rise.deaths.total <- colSums(rise.deaths)
day.first.case <- min(which(rise.deaths.total > 0))
n.days <- length(rise.deaths.total)
barplot(rise.deaths.total[day.first.case:n.days], main = "Rise in Deaths of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
total.cases
US.stats$total.cases <- US.cases.data[,ndays.cases]
US.stats$total.cases.percap <- US.stats$total.cases / US.cases$Population
US.stats$total.cases.percap[US.cases$Population == 0] <- NA
hist(US.stats$total.cases.percap)
total.deaths
US.stats$total.deaths <- US.deaths.data[,ndays.deaths]
total.deaths.percap
US.stats$total.deaths.percap <- US.stats$total.deaths / US.deaths$Population
US.stats$total.deaths.percap[US.deaths$Population == 0] <- NA
max(US.stats$total.deaths.percap,na.rm = T)
## [1] 0.002597528
total.deaths.percase Error in Johns Hopkins data has rows with total.deaths > total.cases.
# pos.case.ind <- US.stats$total.cases > 0
# US.stats$total.deaths.percase[pos.case.ind] <- US.stats$total.deaths[pos.case.ind] / US.stats$total.cases[pos.case.ind]
# US.stats$total.deaths.percase[!pos.case.ind] <- 0
US.stats$total.deaths.percase <- US.stats$total.deaths / US.stats$total.cases
US.stats$total.deaths.percase[US.stats$total.cases == 0] <- NA
US.stats[which(US.stats$total.deaths > US.stats$total.cases),]
## UID avg.rise.cases avg.rise.deaths total.cases
## 3203 84090002 0.0000000 0.04395604 0
## 3206 84090006 0.0000000 0.02197802 0
## 3222 84090024 0.0000000 1.18681319 0
## 3229 84090031 0.1318681 0.21978022 12
## 3231 84090033 0.1868132 0.49450549 17
## 3250 84090054 0.0000000 0.21978022 0
## 3252 84090056 0.0000000 0.05494505 0
## total.cases.percap total.deaths total.deaths.percap
## 3203 NA 4 NA
## 3206 NA 2 NA
## 3222 NA 108 NA
## 3229 NA 20 NA
## 3231 NA 45 NA
## 3250 NA 20 NA
## 3252 NA 5 NA
## total.deaths.percase
## 3203 NA
## 3206 NA
## 3222 NA
## 3229 1.666667
## 3231 2.647059
## 3250 NA
## 3252 NA
US.stats$ID <- str_pad(US.stats$UID, 8, "left", pad = "0")
US.stats$ID <- substr(US.stats$ID, 4, 8)
data.merge <- merge(US.stats, county_factors, by = "ID")
data.cor <- cor(data.merge[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)
data.merge2 <- merge(data.merge, county_500CitiesData, by = "ID", all.x = F)
data.cor2 <- cor(data.merge2[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor2, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)
corrplot.mixed(data.cor2[1:7,8:42], upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)